Python 源码为了安全而编译为 C 代码
· 阅读需 7 分钟
背景
做大模型相关的需求,对于 python 代码需要混淆,不要让客户知道源码,因为 python 是解释型语言,直接 python main.py
就运行了,不像
go 一样可以打包为二进制文件,用户不知道源码是什么样子的。因此 python 代码需要做混淆。
Cython
调研了一圈后发现,大家都在使用 Cython
将 python 代码编译为 C
代码,从而混淆源 码。达到不让对方直接看到源码的诉求。找到了一个工具 https://github.com/Boris-code/jmpy
,但是这个仓库中依赖的
Cython 实在太久了,因此直接把源码拿出来,直接修改了一下源代码。在这里感谢 jmpy
这个项目的作者。
步骤
将这个文件保存为 entrypt_py.py
文件。
# -*- coding: utf-8 -*-
"""
Created on 2018-07-18 18:24
---------
@summary: 加密python代码为pyd/so
---------
@author: Boris
"""
import os
import re
import getopt
import sys
import shutil
import tempfile
from asyncio.log import logger
from distutils.command.build_py import build_py
from distutils.core import setup
from typing import Union, List
from Cython.Build import cythonize
def get_package_dir(*args, **kwargs):
return ""
# 重写get_package_dir, 否者生成的so文件路径有问题
build_py.get_package_dir = get_package_dir
class TemporaryDirectory(object):
def __enter__(self):
self.name = tempfile.mkdtemp()
return self.name
def __exit__(self, exc_type, exc_value, traceback):
shutil.rmtree(self.name)
def search(content, regexs):
if isinstance(regexs, str):
return re.search(regexs, content)
for regex in regexs:
if re.search(regex, content):
return True
def walk_file(file_path):
if os.path.isdir(file_path):
for current_path, sub_folders, files_name in os.walk(file_path):
for file in files_name:
file_path = os.path.join(current_path, file)
yield file_path
else:
yield file_path
def copy_files(src_path, dst_path):
if os.path.isdir(src_path):
if os.path.exists(dst_path):
shutil.rmtree(dst_path)
def callable(src, names: list):
if search(src, dst_path):
return names
return ["dist", ".git", "venv", ".idea", "__pycache__"]
shutil.copytree(src_path, dst_path, ignore=callable)
else:
if not os.path.exists(dst_path):
os.makedirs(dst_path)
shutil.copyfile(src_path, os.path.join(dst_path, os.path.basename(src_path)))
def get_py_files(files, ignore_files: Union[List, str, None] = None):
"""
@summary:
---------
@param files: 文件列表
#param ignore_files: 忽略的文件,支持正则
---------
@result:
"""
for file in files:
if file.endswith(".py"):
if ignore_files and search(file, regexs=ignore_files): # 该文件是忽略的文件
pass
else:
yield file
def filter_cannot_encrypted_py(files, except_main_file):
"""
过滤掉不能加密的文件,如 log.py __main__.py 以及包含 if __name__ == "__main__": 的文件
Args:
files:
Returns:
"""
_files = []
for file in files:
if search(file, regexs="__.*?.py"):
continue
if except_main_file:
with open(file, "r", encoding="utf-8") as f:
content = f.read()
if search(content, regexs="__main__"):
continue
_files.append(file)
return _files
def encrypt_py(py_files: list):
encrypted_py = []
with TemporaryDirectory() as td:
total_count = len(py_files)
for i, py_file in enumerate(py_files):
try:
dir_name = os.path.dirname(py_file)
file_name = os.path.basename(py_file)
os.chdir(dir_name)
logger.debug("正在加密 {}/{}, {}".format(i + 1, total_count, file_name))
setup(
ext_modules=cythonize([file_name], quiet=True, language_level=3),
script_args=["build_ext", "-t", td, "--inplace"],
)
encrypted_py.append(py_file)
logger.debug("加密成功 {}".format(file_name))
except Exception as e:
logger.exception("加密失败 {} , error {}".format(py_file, e))
temp_c = py_file.replace(".py", ".c")
if os.path.exists(temp_c):
os.remove(temp_c)
return encrypted_py
def delete_files(files_path):
"""
@summary: 删除文件
---------
@param files_path: 文件路径 py 及 c 文件
---------
@result:
"""
try:
# 删除python文件及c文件
for file in files_path:
os.remove(file) # py文件
os.remove(file.replace(".py", ".c")) # c文件
except Exception as e:
pass
def rename_excrypted_file(output_file_path):
files = walk_file(output_file_path)
for file in files:
if file.endswith(".pyd") or file.endswith(".so"):
new_filename = re.sub("(.*)\..*\.(.*)", r"\1.\2", file)
os.rename(file, new_filename)
def start_encrypt(
input_file_path,
output_file_path: str = None,
ignore_files: Union[List, str, None] = None,
except_main_file: int = 1,
):
assert input_file_path, "input_file_path cannot be null"
assert (
input_file_path != output_file_path
), "output_file_path must be diffent with input_file_path"
if output_file_path and os.path.isfile(output_file_path):
raise ValueError("output_file_path need a dir path")
input_file_path = os.path.abspath(input_file_path)
if not output_file_path: # 无输出路径
if os.path.isdir(
input_file_path
): # 如果输入路径是文件夹 则输出路径为input_file_path/dist/project_name
output_file_path = os.path.join(
input_file_path, "dist", os.path.basename(input_file_path)
)
else:
output_file_path = os.path.join(os.path.dirname(input_file_path), "dist")
else:
output_file_path = os.path.abspath(output_file_path)
# 拷贝原文件到目标文件
copy_files(input_file_path, output_file_path)
files = walk_file(output_file_path)
py_files = get_py_files(files, ignore_files)
# 过滤掉不需要加密的文件
need_encrypted_py = filter_cannot_encrypted_py(py_files, except_main_file)
encrypted_py = encrypt_py(need_encrypted_py)
delete_files(encrypted_py)
rename_excrypted_file(output_file_path)
logger.debug(
"加密完成 total_count={}, success_count={}, 生成到 {}".format(
len(need_encrypted_py), len(encrypted_py), output_file_path
)
)
def usage():
"""
python代码 加密|加固
参数说明:
-i | --input_file_path 待加密文件或文件夹路径,可是相对路径或绝对路径
-o | --output_file_path 加密后的文件输出路径,默认在input_file_path下创建dist文件夹,存放加密后的文件
-I | --ignore_files 不需要加密的文件或文件夹,逗号分隔
-m | --except_main_file 不加密包含__main__的文件(主文件加密后无法启动), 值为0、1。 默认为1
"""
def execute():
try:
options, args = getopt.getopt(
sys.argv[1:],
"hi:o:I:m:",
[
"help",
"input_file_path=",
"output_file_path=",
"ignore_files=",
"except_main_file=",
],
)
input_file_path = output_file_path = ignore_files = ""
except_main_file = 1
for name, value in options:
if name in ("-h", "--help"):
print(usage.__doc__)
sys.exit()
elif name in ("-i", "--input_file_path"):
input_file_path = value
elif name in ("-o", "--output_file_path"):
output_file_path = value
elif name in ("-I", "--ignore_files"):
ignore_files = value.split(",")
elif name in ("-m", "--except_main_file"):
except_main_file = int(value)
if not input_file_path:
print("需指定-i 或 input_file_path")
print(usage.__doc__)
sys.exit()
start_encrypt(input_file_path, output_file_path, ignore_files, except_main_file)
except getopt.GetoptError:
print(usage.__doc__)
sys.exit()
if __name__ == '__main__':
execute()
编译
将上面的源代码保存为 encrypt_py.py
文件,执行。
-I
: 忽略 .venv
encrypt_py.py
和 tests/*
目录
python encrypt_py.py -i . -I ".venv/*,encrypt_py.py,tests/*"
(.venv) (base) ➜ inference_server git:(main) ✗ python encrypt_py.py -i . -I ".venv/*,encrypt_py.py,tests/*"
delete.c:3235:21: warning: fallthrough annotation in unreachable code [-Wunreachable-code-fallthrough]
CYTHON_FALLTHROUGH;
^
delete.c:556:34: note: expanded from macro 'CYTHON_FALLTHROUGH'
#define CYTHON_FALLTHROUGH __attribute__((fallthrough))
^
delete.c:3246:21: warning: fallthrough annotation in unreachable code [-Wunreachable-code-fallthrough]
CYTHON_FALLTHROUGH;
^
delete.c:556:34: note: expanded from macro 'CYTHON_FALLTHROUGH'
#define CYTHON_FALLTHROUGH __attribute__((fallthrough))
中间的警告可以忽略,都是一些声明的方法,但是没有使用而已。
成果
会在源码目录的 dist
下生成对应的编译好的 C
代码。试着运行,发现一点儿问题没有。
(.venv) (base) ➜ inference_server git:(main) ✗ python encrypt_py.py -i . -I ".venv/*,encrypt_py.py,tests/*"
delete.c:3235:21: warning: fallthrough annotation in unreachable code [-Wunreachable-code-fallthrough]
CYTHON_FALLTHROUGH;
^
delete.c:556:34: note: expanded from macro 'CYTHON_FALLTHROUGH'
#define CYTHON_FALLTHROUGH __attribute__((fallthrough))
^
1 warning generated.
inference.c:8168:26: warning: code will never be executed [-Wunreachable-code]
module = PyImport_ImportModuleLevelObject(
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 warning generated.
inference.c:8168:26: warning: code will never be executed [-Wunreachable-code]
module = PyImport_ImportModuleLevelObject(
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 warning generated.
(.venv) (base) ➜ inference_server git:(main) ✗ pwd
/Users/bytedance/GolandProjects/inference_server
(.venv) (base) ➜ inference_server git:(main) ✗ cd dist/inference_server
(.venv) (base) ➜ inference_server git:(main) ✗ python main.py
INFO: Will watch for changes in these directories: ['/Users/bytedance/GolandProjects/inference_server/dist/inference_server']
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO: Started reloader process [50384] using StatReload
INFO: Started server process [50394]
INFO: Waiting for application startup.
2024-09-06 16:48:48,095 DEBUG Using AsyncIOEngine.POLLER as I/O engine
INFO: Application startup complete.
检查是否还包含源码
(.venv) (base) ➜ inference_server git:(main) ✗ tree src/chain/http
src/chain/http
├── __init__.py
├── __pycache__
│ └── __init__.cpython-310.pyc
├── app.so
├── build
│ └── lib.macosx-10.9-universal2-cpython-310
│ └── inference_server
│ └── src
│ └── chain
│ └── http
│ ├── app.so
│ ├── middleware.so
│ └── streaming_response.so
├── middleware.so
└── streaming_response.so
7 directories, 8 files
可以看到已经没有 python
源码啦,都是 so
库文件了。完美~